In [ ]:
 
In [17]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as stats

1. first contact with the data

1.1. reading the data

In [18]:
df_CSF_inf = pd.read_csv('data/Inflammation_CSF_Etter.csv',sep=';',skiprows=[0,1,2,4,5,6] , index_col=0)

df_CSF_inf.drop(columns=['Plate ID' , 'QC Warning'] , inplace=True)
df_CSF_inf_metadatas = df_CSF_inf.loc[ ['LOD','Missing Data freq.','Normalization'] , :].copy()
df_CSF_inf.drop(index=['LOD','Missing Data freq.','Normalization', np.nan] , inplace=True)

inflammationMolecules = list( df_CSF_inf.columns)
for m in inflammationMolecules:
    df_CSF_inf[m] = df_CSF_inf[m].astype(float)

df_CSF_inf.head()
Out[18]:
IL8 VEGFA CD8A MCP-3 GDNF CDCP1 CD244 IL7 OPG LAP TGF-beta-1 ... TNFRSF9 NT-3 TWEAK CCL20 ST1A1 STAMBP IL5 ADA TNFB CSF-1
Assay
6346 6.92863 9.45613 4.64569 0.92595 2.14514 2.50974 1.74502 0.47182 9.31382 2.56710 ... 2.48836 1.16383 9.19952 1.46232 1.42846 0.73271 1.24755 3.59264 1.27559 8.16527
4995 7.81903 10.04223 5.94773 1.44246 2.16450 3.10576 2.28670 1.07677 10.43554 5.04245 ... 3.54916 1.80602 9.73808 2.13246 1.65259 1.16748 1.90812 3.87319 2.47669 8.96652
5204 7.45676 9.50868 4.45154 0.80615 1.91588 2.68400 1.79186 0.46683 9.50804 3.75175 ... 2.52795 0.88536 9.41759 1.52814 1.09635 0.93010 0.99443 4.14049 1.43471 8.33000
5652 7.33122 9.83424 7.53968 0.93826 2.48501 2.15495 2.35870 0.60660 9.49353 2.62136 ... 5.17659 1.43627 8.73457 1.69590 1.12717 1.13360 1.43174 4.20414 4.90506 8.23255
NC029 6.75276 8.01530 3.46982 0.83231 1.73115 1.90936 1.62248 0.22653 8.02930 0.88580 ... 1.87317 0.81587 8.18665 1.47131 1.27757 0.52831 0.97514 3.29188 1.04656 7.56799

5 rows × 92 columns

In [19]:
df_CSF_neuro = pd.read_csv('data/Neurology_CSF_Etter.csv',sep=';',skiprows=[0,1,2,4,5,6] , index_col=0)

df_CSF_neuro.drop(columns=['Plate ID' , 'QC Warning'] , inplace=True)
df_CSF_neuro_metadatas = df_CSF_neuro.loc[ ['LOD','Missing Data freq.','Normalization'] , :].copy()
df_CSF_neuro.drop(index=['LOD','Missing Data freq.','Normalization', np.nan] , inplace=True)

neuroMolecules = list( df_CSF_neuro.columns )

for m in neuroMolecules:
    df_CSF_neuro[m] = df_CSF_neuro[m].astype(float)

df_CSF_neuro.head()
Out[19]:
NMNAT1 NRP2 MAPT CADM3 GDNF UNC5C VWC2 Siglec-9 CLM-6 EZR ... Dkk-4 EDA2R LAT NTRK3 LAIR-2 MANF TN-R CD200R1 Nr-CAM KYNU
Assay
6346 1.96665 7.09192 3.77791 7.91794 2.31497 4.42140 6.47501 4.05874 4.45070 3.10702 ... 3.20446 7.67390 1.05485 6.97243 8.58782 2.71673 5.78930 1.06282 11.26572 3.80794
4995 2.34375 7.47933 4.56933 8.15273 2.82215 4.79441 7.15407 4.73644 5.04218 3.18018 ... 3.66164 8.46000 1.69681 7.40388 8.29364 3.03099 6.00301 1.54736 11.31532 3.51375
5204 2.19483 7.48286 3.71677 8.08137 2.41192 4.18171 6.33081 4.14324 4.77651 3.58364 ... 3.70008 8.09488 1.62875 7.22072 7.60655 2.75521 5.66704 1.37287 11.34128 4.01682
5652 4.39618 7.18781 3.34626 8.07230 2.48652 4.73700 6.30346 4.30440 4.80531 4.52920 ... 3.17883 7.34854 2.18256 6.92764 6.88023 2.53110 5.49423 1.47046 11.30252 6.46284
NC029 2.16753 6.15583 2.89508 7.61839 2.33290 3.67643 4.26698 2.86324 3.78789 2.50139 ... 2.66940 5.85028 1.52451 6.16698 5.67676 2.08815 5.05372 1.13894 11.27247 3.44996

5 rows × 92 columns

In [20]:
df_Plasma_neuro = pd.read_csv('data/Neurology_Plasma_Etter.csv',sep=';',skiprows=[0,1,2,4,5,6] , index_col=0)

df_Plasma_neuro.drop(columns=['Plate ID' , 'QC Warning'] , inplace=True)
df_Plasma_neuro_metadatas = df_Plasma_neuro.loc[ ['LOD','Missing Data freq.','Normalization'] , :].copy()
df_Plasma_neuro.drop(index=['LOD','Missing Data freq.','Normalization', np.nan] , inplace=True)

for m in neuroMolecules:
    df_Plasma_neuro[m] = df_Plasma_neuro[m].astype(float)
df_Plasma_neuro.head()
Out[20]:
NMNAT1 NRP2 MAPT CADM3 GDNF UNC5C VWC2 Siglec-9 CLM-6 EZR ... Dkk-4 EDA2R LAT NTRK3 LAIR-2 MANF TN-R CD200R1 Nr-CAM KYNU
Assay
5097 3.53954 8.05956 0.05508 3.31939 1.98270 4.23206 5.14522 4.84619 6.17331 5.53970 ... 3.64636 3.11506 6.44210 7.00653 4.22533 8.72360 2.44923 5.89832 9.58219 9.53344
NC020 3.94164 8.12603 0.61754 4.03956 1.95130 5.46608 6.32928 4.38866 6.38791 5.99036 ... 4.43752 5.06298 8.45497 6.49427 4.65111 8.99031 2.95393 4.61084 9.26496 9.00088
NC012 3.32455 8.06942 0.24521 2.49502 2.59845 4.43254 5.50223 5.14197 6.13750 5.59433 ... 3.97797 3.86889 8.31780 6.58391 4.85584 9.02627 2.73757 4.99941 9.49777 9.02388
NC033 3.76958 7.98193 0.20047 2.17538 1.82331 4.09080 4.36522 4.73051 6.22299 6.31782 ... 2.47930 3.09762 8.01145 6.13347 4.33199 8.89879 3.28244 4.72476 9.20661 10.43829
NC007 6.55468 8.38420 0.73049 4.90898 3.14657 5.39288 5.85140 5.48657 6.95294 6.34842 ... 5.23084 6.56718 5.05211 7.11924 6.38258 8.13460 3.58866 4.79933 9.91363 8.36898

5 rows × 92 columns

In [21]:
df_Plasma_inf = pd.read_csv('data/Inflammation_Plasma_Etter.csv',sep=';',skiprows=[0,1,2,4,5,6] , index_col=0)

df_Plasma_inf.drop(columns=['Plate ID' , 'QC Warning'] , inplace=True)
df_Plasma_inf_metadatas = df_Plasma_inf.loc[ ['LOD','Missing Data freq.','Normalization'] , :].copy()
df_Plasma_inf.drop(index=['LOD','Missing Data freq.','Normalization', np.nan] , inplace=True)

for m in inflammationMolecules:
    df_Plasma_inf[m] = df_Plasma_inf[m].astype(float)
df_Plasma_inf.head()
Out[21]:
IL8 VEGFA CD8A MCP-3 GDNF CDCP1 CD244 IL7 OPG LAP TGF-beta-1 ... TNFRSF9 NT-3 TWEAK CCL20 ST1A1 STAMBP IL5 ADA TNFB CSF-1
Assay
5097 3.98525 10.12451 9.25435 1.30819 1.42985 1.87413 5.19681 1.30504 9.53484 4.97770 ... 5.70371 2.22839 7.65602 5.61042 1.15275 4.42246 -0.24458 4.22358 4.01943 9.28920
NC020 5.25480 11.83035 8.94074 3.04921 1.49920 1.64415 5.09875 2.50616 10.94577 6.17284 ... 6.67353 1.82242 7.55233 8.06364 1.92509 6.52342 0.40179 5.86069 3.61788 9.70047
NC012 4.43852 10.10268 9.56985 1.28115 1.81209 2.07791 5.37016 1.47176 9.58873 5.10894 ... 5.64431 1.31701 7.53543 7.37198 2.03079 5.29281 -0.00082 5.15287 3.10043 9.07734
NC033 4.90626 10.41222 9.34333 2.05347 0.79792 1.61883 5.18985 2.27891 9.71240 5.25125 ... 6.01083 1.30448 7.39177 7.68204 2.04610 6.04606 2.07324 5.38520 3.35968 9.38988
NC007 7.31225 11.75040 8.74524 3.48664 2.18185 3.37656 5.64090 2.82271 11.02299 6.07137 ... 7.32957 0.89025 8.76430 6.28545 1.28180 3.43511 -0.37574 4.29185 4.21954 9.95298

5 rows × 92 columns

Are the index the same ?

In [22]:
plasmaSamples = set(df_Plasma_inf.index)
csfSamples = set(df_CSF_inf.index)
print( 'csfSamples-plasmaSamples',csfSamples-plasmaSamples )
print( 'plasmaSamples-csfSamples' , plasmaSamples-csfSamples )
csfSamples-plasmaSamples set()
plasmaSamples-csfSamples {'NC032', 'NC014', 'NC021', 'NC001', 'NC017'}

4 missing CSF sample (expected from the metadata)

Are the CSF and plasma measurements correlated ?

In [23]:
# for c in inflammationMolecules:
    
#     x = df_Plasma_inf.loc[df_CSF_inf.index , c]
#     y = df_CSF_inf.loc[df_CSF_inf.index , c]
#     r,p = stats.pearsonr(list(x),list(y))
    
#     fig,ax = plt.subplots(1,1)
#     ax.plot(x,y,'o')
#     ax.set_xlabel("plasma")
#     ax.set_ylabel("CSF")
#     ax.set_title( '{} NPX - pearson correlation : {:.2f} '.format(c,r) )
    
#     fig.savefig('images/preliminary/'+c+'.plasma_CSF.png')
    
In [24]:
# for c in neuroMolecules:
    
#     x = df_Plasma_neuro.loc[df_CSF_inf.index , c]
#     y = df_CSF_neuro.loc[df_CSF_inf.index , c]
#     r,p = stats.pearsonr(list(x),list(y))
    
#     fig,ax = plt.subplots(1,1)
#     ax.plot(x,y,'o')
#     ax.set_xlabel("plasma")
#     ax.set_ylabel("CSF")
#     ax.set_title( '{} NPX - pearson correlation : {:.2f} '.format(c,r) )
    
#     fig.savefig('images/preliminary/'+c+'.plasma_CSF.png')
    

1.2. read metadata

In [48]:
df_metadata = pd.read_csv('data/metadata.csv', index_col='studyID')
df_metadata.covid = df_metadata.covid.astype(bool)
df_metadata.Age = df_metadata.Age.astype(int)
df_metadata.head()
Out[48]:
Age Sex Group Stage covid
studyID
4927 38 f healthy control I False
5050 53 m healthy control I False
5052 22 m healthy control I False
5062 23 m healthy control I False
5092 47 f healthy control I False

2. cleaning the data

2.1. applying detection threshold

In [26]:
DF = df_CSF_inf
DFMD = df_CSF_inf_metadatas
MOL = inflammationMolecules

if DFMD.shape[0]==3 :
    DFMD = DFMD.transpose()
DFMD.LOD = DFMD.LOD.astype(float)

underThreshold = DF<DFMD.LOD

DF[underThreshold] = np.nan

N = underThreshold.shape[0] * underThreshold.shape[1]
n = underThreshold.sum().sum()
print("number of measurment under detection threshold : {} / {}".format(n,N) )
fig,axes = plt.subplots(1,3,figsize=(16,8))

axes[0].hist(underThreshold.sum())
axes[0].set_title('NAs per molecule')

axes[1].hist(underThreshold.sum(axis=1))
axes[1].set_title('NAs per sample')

axes[2].scatter( DF.isnull().sum() / DF.shape[0] , DF.median() )
axes[2].set_ylabel('median NPX')
axes[2].set_xlabel('fraction of NAs')
number of measurment under detection threshold : 2840 / 7820
Out[26]:
Text(0.5, 0, 'fraction of NAs')
In [27]:
DF = df_Plasma_inf
DFMD = df_Plasma_inf_metadatas
MOL = inflammationMolecules

if DFMD.shape[0]==3 :
    DFMD = DFMD.transpose()
DFMD.LOD = DFMD.LOD.astype(float)

underThreshold = DF<DFMD.LOD

DF[underThreshold] = np.nan

N = underThreshold.shape[0] * underThreshold.shape[1]
n = underThreshold.sum().sum()
print("number of measurment under detection threshold : {} / {}".format(n,N) )
fig,axes = plt.subplots(1,3,figsize=(16,8))

axes[0].hist(underThreshold.sum())
axes[0].set_title('NAs per molecule')

axes[1].hist(underThreshold.sum(axis=1))
axes[1].set_title('NAs per sample')

axes[2].scatter( DF.isnull().sum() / DF.shape[0] , DF.median() )
axes[2].set_ylabel('median NPX')
axes[2].set_xlabel('fraction of NAs')
number of measurment under detection threshold : 2131 / 8280
Out[27]:
Text(0.5, 0, 'fraction of NAs')
In [28]:
DF = df_CSF_neuro
DFMD = df_CSF_neuro_metadatas
MOL = neuroMolecules

if DFMD.shape[0]==3 :
    DFMD = DFMD.transpose()
DFMD.LOD = DFMD.LOD.astype(float)

underThreshold = DF<DFMD.LOD

DF[underThreshold] = np.nan

N = underThreshold.shape[0] * underThreshold.shape[1]
n = underThreshold.sum().sum()
print("number of measurment under detection threshold : {} / {}".format(n,N) )
fig,axes = plt.subplots(1,3,figsize=(16,8))

axes[0].hist(underThreshold.sum())
axes[0].set_title('NAs per molecule')

axes[1].hist(underThreshold.sum(axis=1))
axes[1].set_title('NAs per sample')

axes[2].scatter( DF.isnull().sum() / DF.shape[0] , DF.median() )
axes[2].set_ylabel('median NPX')
axes[2].set_xlabel('fraction of NAs')
number of measurment under detection threshold : 921 / 7820
Out[28]:
Text(0.5, 0, 'fraction of NAs')
In [29]:
DF = df_Plasma_neuro
DFMD = df_Plasma_neuro_metadatas
MOL = neuroMolecules

if DFMD.shape[0]==3 :
    DFMD = DFMD.transpose()
DFMD.LOD = DFMD.LOD.astype(float)

underThreshold = DF<DFMD.LOD

DF[underThreshold] = np.nan

N = underThreshold.shape[0] * underThreshold.shape[1]
n = underThreshold.sum().sum()
print("number of measurment under detection threshold : {} / {}".format(n,N) )
fig,axes = plt.subplots(1,3,figsize=(16,8))

axes[0].hist(underThreshold.sum())
axes[0].set_title('NAs per molecule')

axes[1].hist(underThreshold.sum(axis=1))
axes[1].set_title('NAs per sample')

axes[2].scatter( DF.isnull().sum() / DF.shape[0] , DF.median() )
axes[2].set_ylabel('median NPX')
axes[2].set_xlabel('fraction of NAs')
number of measurment under detection threshold : 352 / 8280
Out[29]:
Text(0.5, 0, 'fraction of NAs')

The inflammatory panel contains much more measurments under the detection thresholds than the neuro panel.

Also, as experienced in previous experiments the fraction of NAs are associated with a lower median expression.

Proposed strategy :

  • imput the missing NAs as either half the detection threshold OR such that the sum of all imputed values on the column is 0.1 of the column values sum (whichever is the smallest)

We will first drop the columns which have no non-NAs:

In [42]:
def imputeOLINKdata( DF, DFMD ,ax):


    if DFMD.shape[0]==3 :
        DFMD = DFMD.transpose()
    DFMD.LOD = DFMD.LOD.astype(float)


    toDrop = list(  DF.columns[ (DF.isnull().sum() / DF.shape[1]) == 1 ]  )
    if toDrop == 1 :
        print('dropping columns', toDrop)
        DF.drop(columns = toDrop , inplace=True)
    else:
        print('no columns to drop')


    I1 = ( DF.sum() * 0.1/0.9 ) / DF.isnull().sum()
    I2 = 0.5 * DFMD.loc[ DF.columns , "LOD"]
    imputationValues = np.minimum( I1 , I2 ) 

    ax.scatter( DF.isnull().sum() / DF.shape[0], I1 , label="NA sum == 0.1 colSum")
    ax.scatter( DF.isnull().sum() / DF.shape[0] , I2 , label = "0.5 * detection threshold" )
    ax.set_ylim(0.0,10.0)
    ax.set_xlabel( "NAs fraction" )
    ax.set_ylabel( "imputation NPX" )
    ax.legend()

    DFImputed = DF.copy()

    DFImputed.fillna(imputationValues  , inplace = True)

    #print( 'number of NAs after imputation: ' , DFImputed.isnull().sum().sum() )
    return DFImputed
In [44]:
fig,axes = plt.subplots(2,2,figsize = (14,10))

df_Plasma_neuro_imputed = imputeOLINKdata( df_Plasma_neuro, df_Plasma_neuro_metadatas ,axes[0][0])
axes[0][0].set_title("Plasma - neuro")

df_Plasma_inf_imputed = imputeOLINKdata( df_Plasma_inf, df_Plasma_inf_metadatas ,axes[0][1])
axes[0][1].set_title("Plasma - inflammatory")

df_CSF_neuro_imputed = imputeOLINKdata( df_CSF_neuro, df_CSF_neuro_metadatas ,axes[1][0])
axes[1][0].set_title("CSF - neuro")

df_CSF_inf_imputed = imputeOLINKdata( df_CSF_inf, df_CSF_inf_metadatas ,axes[1][1])
axes[1][1].set_title("CSF - inflammatory")
no columns to drop
no columns to drop
no columns to drop
no columns to drop
Out[44]:
Text(0.5, 1.0, 'CSF - inflammatory')

3. preliminary visualizations

3.1. Plasma and CSF correlation?

In [53]:
for c in inflammationMolecules:
    
    x = df_Plasma_inf_imputed.loc[df_CSF_inf_imputed.index , c]
    y = df_CSF_inf_imputed.loc[df_CSF_inf_imputed.index , c]
    r,p = stats.pearsonr(list(x),list(y))
    t,p = stats.kendalltau(list(x),list(y))
    
    fig,ax = plt.subplots(1,1)
    sns.scatterplot( x=x,y=y, hue= df_metadata.Group[ df_CSF_inf_imputed.index ] , ax = ax)
    #ax.plot(x,y,'o')
    ax.set_xlabel("plasma")
    ax.set_ylabel("CSF")
    ax.set_title( '{} NPX - pearson : {:.2f} - kendall : {:.2f} '.format(c,r,t) )
    
    fig.savefig('images/imputed/inflammatory_'+c+'.plasma_CSF.png')
    
<ipython-input-53-47095bdbaee0>:8: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
  fig,ax = plt.subplots(1,1)
/home/wandrille/Installed_software/anaconda3/envs/py38/lib/python3.8/site-packages/scipy/stats/stats.py:3913: PearsonRConstantInputWarning: An input array is constant; the correlation coefficent is not defined.
  warnings.warn(PearsonRConstantInputWarning())
/home/wandrille/Installed_software/anaconda3/envs/py38/lib/python3.8/site-packages/scipy/stats/stats.py:3913: PearsonRConstantInputWarning: An input array is constant; the correlation coefficent is not defined.
  warnings.warn(PearsonRConstantInputWarning())
/home/wandrille/Installed_software/anaconda3/envs/py38/lib/python3.8/site-packages/scipy/stats/stats.py:3913: PearsonRConstantInputWarning: An input array is constant; the correlation coefficent is not defined.
  warnings.warn(PearsonRConstantInputWarning())
/home/wandrille/Installed_software/anaconda3/envs/py38/lib/python3.8/site-packages/scipy/stats/stats.py:3913: PearsonRConstantInputWarning: An input array is constant; the correlation coefficent is not defined.
  warnings.warn(PearsonRConstantInputWarning())
/home/wandrille/Installed_software/anaconda3/envs/py38/lib/python3.8/site-packages/scipy/stats/stats.py:3913: PearsonRConstantInputWarning: An input array is constant; the correlation coefficent is not defined.
  warnings.warn(PearsonRConstantInputWarning())
In [54]:
for c in neuroMolecules:
    
    x = df_Plasma_neuro_imputed.loc[df_CSF_neuro_imputed.index , c]
    y = df_CSF_neuro_imputed.loc[df_CSF_neuro_imputed.index , c]
    r,p = stats.pearsonr(list(x),list(y))
    t,p = stats.kendalltau(list(x),list(y))
    
    fig,ax = plt.subplots(1,1)
    sns.scatterplot( x=x,y=y, hue= df_metadata.Group[ df_CSF_neuro_imputed.index ] , ax = ax)
    #ax.plot(x,y,'o')
    ax.set_xlabel("plasma")
    ax.set_ylabel("CSF")
    ax.set_title( '{} NPX - pearson : {:.2f} - kendall : {:.2f} '.format(c,r,t) )
    
    fig.savefig('images/imputed/neuro_'+c+'.plasma_CSF.png')
    
<ipython-input-54-8652cd8d1075>:8: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
  fig,ax = plt.subplots(1,1)

It would appear that for most markers the correlation is present but not very strong.

Some markers seems to have different discriminating abilities depending on whether the sample comes from plasma or CSF.

Some columns have a very high number of NAs... I should be especially careful with these and any conclusions that may hang on them.

3.2. PCA

Just to get a look at the data and its structure

3.2.1 CSF neuro

In [131]:
from sklearn import decomposition
from sklearn import preprocessing

data = df_CSF_neuro_imputed

# scale
scaler = preprocessing.StandardScaler().fit(data)
data_scaled = scaler.transform(data)

pca = decomposition.PCA(n_components=20)
pca.fit(data_scaled)

data_pca_transformed = pca.transform(data_scaled)
In [132]:
fig, axes = plt.subplots(2,1 , figsize = (14,7))
axes[0].plot( pca.explained_variance_ratio_ , marker = 'o') 
axes[0].set_xlabel('number of components')
axes[0].set_ylabel('fraction explained variance')
axes[1].plot( np.cumsum( pca.explained_variance_ratio_ ) , marker = 'o') 
axes[1].axhline(0.8 , color = 'grey')
axes[1].set_xlabel('number of components')
axes[1].set_ylabel('fraction explained variance (cumulative)')
Out[132]:
Text(0, 0.5, 'fraction explained variance (cumulative)')
In [133]:
data_pca = pd.DataFrame ( data_pca_transformed )
data_pca.index = data.index
data_pca = data_pca.join(df_metadata.loc[ data_pca.index ] )
import plotly.express as px

fig = px.scatter(data_pca , x= 0 , y=1 , 
                  color = 'Group' , symbol = 'Sex',
                 hover_data = ['Age','Sex','Stage','Group'] )
fig.show()
fig.write_html("interactive_figures/PCA_CSF_neuro.html")
In [134]:
f, ax = plt.subplots(figsize=(16, 6))
sns.heatmap( pca.components_ , ax = ax , xticklabels = data.columns , cmap ='viridis')
Out[134]:
<AxesSubplot:>
In [135]:
fig = px.scatter_3d(data_pca , x= 0 , y=1 , z=2,
                 color = 'Group' , symbol = 'Sex',
                 hover_data = ['Age','Sex','Stage','Group'] , 
                 labels = {'0':"PCA 0 - {:.2f}".format(pca.explained_variance_ratio_[0]) , 
                           '1':"PCA 1 - {:.2f}".format(pca.explained_variance_ratio_[1]) , 
                           '2':"PCA 2 - {:.2f}".format(pca.explained_variance_ratio_[2]) } )
fig.show()
fig.write_html("interactive_figures/PCA_CSF_neuro_3D.html")
print( "showing {:.3f} of the variance".format( sum( pca.explained_variance_ratio_[:3] ) ) )
showing 0.666 of the variance

3.2.2 Plasma neuro

In [136]:
from sklearn import decomposition
from sklearn import preprocessing

data = df_Plasma_neuro_imputed

# scale
scaler = preprocessing.StandardScaler().fit(data)
data_scaled = scaler.transform(data)

pca = decomposition.PCA(n_components=20)
pca.fit(data_scaled)

data_pca_transformed = pca.transform(data_scaled)
In [137]:
fig, axes = plt.subplots(2,1 , figsize = (14,7))
axes[0].plot( pca.explained_variance_ratio_ , marker = 'o') 
axes[0].set_xlabel('number of components')
axes[0].set_ylabel('fraction explained variance')
axes[1].plot( np.cumsum( pca.explained_variance_ratio_ ) , marker = 'o') 
axes[1].axhline(0.8 , color = 'grey')
axes[1].set_xlabel('number of components')
axes[1].set_ylabel('fraction explained variance (cumulative)')
Out[137]:
Text(0, 0.5, 'fraction explained variance (cumulative)')
In [138]:
data_pca = pd.DataFrame ( data_pca_transformed )
data_pca.index = data.index
data_pca = data_pca.join(df_metadata.loc[ data_pca.index ] )
import plotly.express as px

fig = px.scatter(data_pca , x= 0 , y=1 , 
                  color = 'covid' , 
                 hover_data = ['Age','Sex','Stage','Group'] )
fig.show()
fig.write_html("interactive_figures/PCA_Plasma_neuro.html")
In [139]:
f, ax = plt.subplots(figsize=(16, 6))
sns.heatmap( pca.components_ , ax = ax , xticklabels = data.columns , cmap ='viridis')
Out[139]:
<AxesSubplot:>
In [140]:
fig = px.scatter_3d(data_pca , x= 0 , y=1 , z=2,
                 color = 'Group' , symbol = 'Sex',
                 hover_data = ['Age','Sex','Stage','Group'] , 
                 labels = {'0':"PCA 0 - {:.2f}".format(pca.explained_variance_ratio_[0]) , 
                           '1':"PCA 1 - {:.2f}".format(pca.explained_variance_ratio_[1]) , 
                           '2':"PCA 2 - {:.2f}".format(pca.explained_variance_ratio_[2]) } )
fig.show()
fig.write_html("interactive_figures/PCA_Plasma_neuro_3D.html")
print( "showing {:.3f} of the variance".format( sum( pca.explained_variance_ratio_[:3] ) ) )
showing 0.474 of the variance

3.2.3 CSF inflammatory

In [141]:
from sklearn import decomposition
from sklearn import preprocessing

data = df_CSF_inf_imputed

# scale
scaler = preprocessing.StandardScaler().fit(data)
data_scaled = scaler.transform(data)

pca = decomposition.PCA(n_components=20)
pca.fit(data_scaled)

data_pca_transformed = pca.transform(data_scaled)
In [142]:
fig, axes = plt.subplots(2,1 , figsize = (14,7))
axes[0].plot( pca.explained_variance_ratio_ , marker = 'o') 
axes[0].set_xlabel('number of components')
axes[0].set_ylabel('fraction explained variance')
axes[1].plot( np.cumsum( pca.explained_variance_ratio_ ) , marker = 'o') 
axes[1].axhline(0.8 , color = 'grey')
axes[1].set_xlabel('number of components')
axes[1].set_ylabel('fraction explained variance (cumulative)')
Out[142]:
Text(0, 0.5, 'fraction explained variance (cumulative)')
In [143]:
data_pca = pd.DataFrame ( data_pca_transformed )
data_pca.index = data.index
data_pca = data_pca.join(df_metadata.loc[ data_pca.index ] )
import plotly.express as px

fig = px.scatter(data_pca , x= 0 , y=1 , 
                  color = 'Group' , 
                 hover_data = ['Age','Sex','Stage','Group'] )
fig.show()
fig.write_html("interactive_figures/PCA_CSF_inf.html")
In [144]:
f, ax = plt.subplots(figsize=(16, 6))
sns.heatmap( pca.components_ , ax = ax , xticklabels = data.columns , cmap ='viridis')
Out[144]:
<AxesSubplot:>
In [145]:
fig = px.scatter_3d(data_pca , x= 0 , y=1 , z=2,
                 color = 'Group' , symbol = 'Sex',
                 hover_data = ['Age','Sex','Stage','Group'] , 
                 labels = {'0':"PCA 0 - {:.2f}".format(pca.explained_variance_ratio_[0]) , 
                           '1':"PCA 1 - {:.2f}".format(pca.explained_variance_ratio_[1]) , 
                           '2':"PCA 2 - {:.2f}".format(pca.explained_variance_ratio_[2]) } )
fig.show()
fig.write_html("interactive_figures/PCA_CSF_inflammatory_3D.html")
print( "showing {:.3f} of the variance".format( sum( pca.explained_variance_ratio_[:3] ) ) )
showing 0.633 of the variance

3.2.3 CSF neuro

In [146]:
from sklearn import decomposition
from sklearn import preprocessing

data = df_CSF_neuro_imputed

# scale
scaler = preprocessing.StandardScaler().fit(data)
data_scaled = scaler.transform(data)

pca = decomposition.PCA(n_components=20)
pca.fit(data_scaled)

data_pca_transformed = pca.transform(data_scaled)
In [147]:
fig, axes = plt.subplots(2,1 , figsize = (14,7))
axes[0].plot( pca.explained_variance_ratio_ , marker = 'o') 
axes[0].set_xlabel('number of components')
axes[0].set_ylabel('fraction explained variance')
axes[1].plot( np.cumsum( pca.explained_variance_ratio_ ) , marker = 'o') 
axes[1].axhline(0.8 , color = 'grey')
axes[1].set_xlabel('number of components')
axes[1].set_ylabel('fraction explained variance (cumulative)')
Out[147]:
Text(0, 0.5, 'fraction explained variance (cumulative)')
In [148]:
data_pca = pd.DataFrame ( data_pca_transformed )
data_pca.index = data.index
data_pca = data_pca.join(df_metadata.loc[ data_pca.index ] )
import plotly.express as px

fig = px.scatter(data_pca , x= 0 , y=1 , 
                  color = 'Group' , 
                 hover_data = ['Age','Sex','Stage','Group'] )
fig.show()
fig.write_html("interactive_figures/PCA_CSF_neuro.html")
In [149]:
f, ax = plt.subplots(figsize=(16, 6))
sns.heatmap( pca.components_ , ax = ax , xticklabels = data.columns , cmap ='viridis')
Out[149]:
<AxesSubplot:>
In [150]:
fig = px.scatter_3d(data_pca , x= 0 , y=1 , z=2,
                 color = 'Group' , symbol = 'Sex',
                 hover_data = ['Age','Sex','Stage','Group'] , 
                 labels = {'0':"PCA 0 - {:.2f}".format(pca.explained_variance_ratio_[0]) , 
                           '1':"PCA 1 - {:.2f}".format(pca.explained_variance_ratio_[1]) , 
                           '2':"PCA 2 - {:.2f}".format(pca.explained_variance_ratio_[2]) } )
fig.show()
fig.write_html("interactive_figures/PCA_CSF_neuro_3D.html")
print( "showing {:.3f} of the variance".format( sum( pca.explained_variance_ratio_[:3] ) ) )
showing 0.666 of the variance

3.3. looking at common columns

In [161]:
commonMarkers = list ( set( df_CSF_neuro_imputed.columns ).intersection( df_CSF_inf_imputed.columns ) )
commonMarkers
Out[161]:
['GDNF', 'Beta-NGF']
In [163]:
fig, axes = plt.subplots(2,len(commonMarkers) , figsize = (14,14) )

for i,c in enumerate(commonMarkers):
    
    axes[0][i].plot( df_CSF_neuro_imputed.loc[df_CSF_neuro_imputed.index , c] ,
                     df_CSF_inf_imputed.loc[df_CSF_neuro_imputed.index , c] , 'o')
    axes[0][i].set_xlabel('CSF neuro '+c)
    axes[0][i].set_ylabel('CSF inflammatory '+c)
    
    axes[1][i].plot( df_Plasma_neuro_imputed.loc[df_Plasma_neuro_imputed.index , c] ,
                     df_Plasma_inf_imputed.loc[df_Plasma_neuro_imputed.index , c] ,  'o' )
    axes[1][i].set_xlabel('Plasma neuro '+c)
    axes[1][i].set_ylabel('Plasma inflammatory '+c)

These are heavily imputed columns in both cases, so this is not very informative ...

4. merging and writing

4.1. merging

In [165]:
## let's keep only the neuro copy of the 2 common markers
inflammationMoleculesOfInterest = inflammationMolecules.copy()
for c in commonMarkers:
    inflammationMoleculesOfInterest.remove(c)
In [172]:
tmpCSF = pd.concat( [ df_CSF_neuro_imputed, df_CSF_inf_imputed.loc[:,inflammationMoleculesOfInterest]  ] , axis = 1 )
tmpCSF.columns = 'CSF_' + tmpCSF.columns
print(tmpCSF.shape)
tmpCSF.head()
(85, 182)
Out[172]:
CSF_NMNAT1 CSF_NRP2 CSF_MAPT CSF_CADM3 CSF_GDNF CSF_UNC5C CSF_VWC2 CSF_Siglec-9 CSF_CLM-6 CSF_EZR ... CSF_TNFRSF9 CSF_NT-3 CSF_TWEAK CSF_CCL20 CSF_ST1A1 CSF_STAMBP CSF_IL5 CSF_ADA CSF_TNFB CSF_CSF-1
Assay
6346 0.693481 7.09192 3.77791 7.91794 0.014267 4.42140 6.47501 4.05874 4.45070 3.10702 ... 2.48836 0.00714 9.19952 1.46232 0.029452 0.73271 0.013983 3.59264 1.27559 8.16527
4995 2.343750 7.47933 4.56933 8.15273 2.822150 4.79441 7.15407 4.73644 5.04218 3.18018 ... 3.54916 1.80602 9.73808 2.13246 1.652590 1.16748 1.908120 3.87319 2.47669 8.96652
5204 2.194830 7.48286 3.71677 8.08137 0.014267 4.18171 6.33081 4.14324 4.77651 3.58364 ... 2.52795 0.00714 9.41759 1.52814 0.029452 0.93010 0.013983 4.14049 1.43471 8.33000
5652 4.396180 7.18781 3.34626 8.07230 0.014267 4.73700 6.30346 4.30440 4.80531 4.52920 ... 5.17659 0.00714 8.73457 1.69590 0.029452 1.13360 0.013983 4.20414 4.90506 8.23255
NC029 2.167530 6.15583 2.89508 7.61839 0.014267 3.67643 4.26698 2.86324 3.78789 2.50139 ... 1.87317 0.00714 8.18665 1.47131 0.029452 0.09874 0.013983 3.29188 1.04656 7.56799

5 rows × 182 columns

In [173]:
tmpPlasma = pd.concat( [ df_Plasma_neuro_imputed, df_Plasma_inf_imputed.loc[:,inflammationMoleculesOfInterest]  ] , axis = 1 )
tmpPlasma.columns = 'Plasma_' + tmpPlasma.columns
print(tmpPlasma.shape)
tmpPlasma.head()
(90, 182)
Out[173]:
Plasma_NMNAT1 Plasma_NRP2 Plasma_MAPT Plasma_CADM3 Plasma_GDNF Plasma_UNC5C Plasma_VWC2 Plasma_Siglec-9 Plasma_CLM-6 Plasma_EZR ... Plasma_TNFRSF9 Plasma_NT-3 Plasma_TWEAK Plasma_CCL20 Plasma_ST1A1 Plasma_STAMBP Plasma_IL5 Plasma_ADA Plasma_TNFB Plasma_CSF-1
Assay
5097 3.53954 8.05956 0.007075 3.31939 0.085808 4.23206 5.14522 4.84619 6.17331 5.53970 ... 5.70371 2.228390 7.65602 5.61042 0.181655 4.42246 0.014006 4.22358 4.01943 9.28920
NC020 3.94164 8.12603 0.007075 4.03956 0.085808 5.46608 6.32928 4.38866 6.38791 5.99036 ... 6.67353 1.822420 7.55233 8.06364 1.925090 6.52342 0.014006 5.86069 3.61788 9.70047
NC012 3.32455 8.06942 0.007075 2.49502 2.598450 4.43254 5.50223 5.14197 6.13750 5.59433 ... 5.64431 0.289609 7.53543 7.37198 2.030790 5.29281 0.014006 5.15287 3.10043 9.07734
NC033 3.76958 7.98193 0.007075 2.17538 0.085808 4.09080 4.36522 4.73051 6.22299 6.31782 ... 6.01083 0.289609 7.39177 7.68204 2.046100 6.04606 2.073240 5.38520 3.35968 9.38988
NC007 6.55468 8.38420 0.007075 4.90898 3.146570 5.39288 5.85140 5.48657 6.95294 6.34842 ... 7.32957 0.289609 8.76430 6.28545 0.181655 3.43511 0.014006 4.29185 4.21954 9.95298

5 rows × 182 columns

In [181]:
total = pd.concat( [tmpCSF , tmpPlasma] ,axis=1)
total.shape
Out[181]:
(90, 364)

4.2. writing

In [182]:
df_CSF_inf_imputed.to_csv( 'imputed_data/Inflammation_CSF_imputed.csv' )
df_CSF_neuro_imputed.to_csv('imputed_data/Neurology_CSF_imputed.csv')
df_Plasma_inf_imputed.to_csv( 'imputed_data/Inflammation_Plasma_imputed.csv' )
df_Plasma_neuro_imputed.to_csv('imputed_data/Neurology_Plasma_imputed.csv')
In [183]:
total.to_csv( 'imputed_data/ALL_imputed.csv' )